Let's first import all the required packages.
# general
import os
import numpy as np
import pandas as pd
import time
import re
import itertools
import pickle
# clustering algorithms, distance metrics
from sklearn.cluster import DBSCAN,KMeans
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import distance
import scipy.cluster.hierarchy as sch
# networks
import networkx as nx
from pyvis import network
from pyvis.network import Network
# plotting
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.figure_factory as ff
# corpus readers, co-occurrences and word vectors
from nltk.collocations import *
import nltk
import pickle
#from cltk.stops import lat as lat_stops
#import cltk
from nltk.util import skipgrams
from nltk.lm import NgramCounter
from gensim.models import Word2Vec
from gensim.models.keyedvectors import Word2VecKeyedVectors
from gensim.similarities import MatrixSimilarity
from utils.data.buildCollocs import BuildCollocs
from utils.data.readCorpus import NltkCorpusFromDir, CorpusFromDir, NltkCorpusFromList
from utils.data.buildModels import BuildModels
from utils.data.collDiffs import collDiffs
from nltk.corpus.reader.plaintext import PlaintextCorpusReader, CategorizedPlaintextCorpusReader
from nltk.tokenize.simple import SpaceTokenizer, LineTokenizer
from nltk.text import Text, TextCollection
If the rebuild parameter is set to True we will rerun the most resource-consuming code instead of reading the pre-computed variables from pickles.
rebuild = False
We are interested in semantic change patterns of a set of terms related to the socio-political life, such as:
# define the socio-political vocabulary under study
socio_political_terms = [
    "civitas", "consilium", "consul", "dux", "gens", "hostis", "imperator",
    "jus", "labor", "natio", "nobilitas", "pontifex", "pontificium",
    "populus", "potestas", "regnum", "senatus", "sodes", "urbs",
]
print(socio_political_terms)
['civitas', 'consilium', 'consul', 'dux', 'gens', 'hostis', 'imperator', 'jus', 'labor', 'natio', 'nobilitas', 'pontifex', 'pontificium', 'populus', 'potestas', 'regnum', 'senatus', 'sodes', 'urbs']
We're assigning each term a separate colour to facilitate our analyses.
# fix one qualitative colour per term so every later plot uses the same mapping
color_discrete_map_terms = {
    term: px.colors.qualitative.Alphabet[i]
    for i, term in enumerate(socio_political_terms)
}
# legend-style bar chart previewing the term -> colour assignment
fig = go.Figure()
fig.add_trace(go.Bar(
    x=list(color_discrete_map_terms.keys()),
    y=[0.5] * len(color_discrete_map_terms),
    text=socio_political_terms,
    textangle=90,
    marker_color=list(color_discrete_map_terms.values()),
))
fig.update_layout(showlegend=False,
                  xaxis={'showgrid': False, 'visible': False},
                  yaxis={'showgrid': False, 'visible': False})
The corpus processing phase follows as close as possible BMG's workflow to keep models compatible. There are 2 exceptions:
# prepare the corpus: tokens equal to any of these strings are dropped later
punctuation = ['.', ',', '...', ';', ':', '?', '(', ')', '-', '!', '[', ']', '"', "'", '""', '\n']
# corpus files
#dir_in = os.path.join("/home/krzys/Kod/streamlit/voces/data/corpora/latinise_IT_lemmas/")
dir_input = os.path.join("/home/krzys/Kod/lvlt22/BMG/LatinISE_1/")  # includes texts first omitted due to parsing issues
dir_in = os.path.join(dir_input, "preprocessed_lemmas")
dir_in_words = os.path.join(dir_input, "preprocessed_tokens")
# keep only the files whose name contains "IT"
files = [f for f in os.listdir(dir_in) if "IT" in f]
We'll be storing corpus metadata in a data frame.
# metadata (BMG)
metadata_path = os.path.join(dir_input, 'latinise_metadata.csv')
metadata_df = pd.read_csv(metadata_path, sep=",")
# restrict to the "IT" subcorpus, matching the file selection above
metadata_df = metadata_df[metadata_df['id'].str.startswith("IT")]
metadata_df.head()
metadata_df["date"] = metadata_df["date"].astype('int')  # ensure we're working with integers
first_date = min(metadata_df.date)
last_date = 900  # BMG
Define size of the time intervals:
size_interval = 450 # BMG
So there are
n_intervals = round((last_date-first_date)/size_interval) # BMG: number of time slices spanning the corpus
n_intervals
3
time intervals.
Define the time periods and split the corpus:
# interval boundaries: first_date, first_date+step, ... — n_intervals+1 edges (BMG)
intervals = [first_date + t * size_interval for t in range(n_intervals + 1)]
print(intervals)
# human-readable labels such as "-450-0", one per pair of consecutive edges
periods_labels = [f"{lo}-{hi}" for lo, hi in zip(intervals, intervals[1:])]
print(periods_labels)
[-450, 0, 450, 900] ['-450-0', '0-450', '450-900']
Add a column to the metadata_df for the time interval:
# annotate each document with the start year of the interval it falls into
metadata_df['time_interval'] = ""
for t in range(len(intervals) - 1):
    print(t)
    span = range(intervals[t], intervals[t + 1])
    print(span)
    in_span = metadata_df['date'].isin(span)
    metadata_df_t = metadata_df.loc[in_span]
    print(metadata_df_t.date)
    metadata_df.loc[in_span, 'time_interval'] = intervals[t]
metadata_df
0
range(-450, 0)
19 -9
34 -49
39 -45
42 -49
57 -80
...
635 -149
638 -107
642 -37
643 -37
649 -229
Name: date, Length: 77, dtype: int64
1
range(0, 450)
18 382
23 399
24 391
37 158
38 49
...
682 382
683 116
684 116
685 116
686 116
Name: date, Length: 235, dtype: int64
2
range(450, 900)
20 524
102 800
104 800
105 800
106 800
...
609 598
634 550
636 550
645 450
1265 533
Name: date, Length: 73, dtype: int64
| id | title | creator | date | type | file | time_interval | |
|---|---|---|---|---|---|---|---|
| 18 | IT-LAT0001 | Vulgata | Hieronymus | 382 | poetry | lat_0382_IT-LAT0001.txt | 0 |
| 19 | IT-LAT0537 | Ars amatoria | Ovidius Naso, Publius | -9 | poetry | lat_-009_IT-LAT0537.txt | -450 |
| 20 | IT-LAT0011 | S. Benedicti Regula | Benedictus Nursianus | 524 | prose | lat_0524_IT-LAT0011.txt | 450 |
| 21 | IT-LAT0012 | In psalmis Davidis expositio | Thomas Aquinas: Sanctus | 1254 | prose | lat_1254_IT-LAT0012.txt | |
| 22 | IT-LAT0014 | Adoro te devote | Thomas Aquinas: Sanctus | 1254 | poetry | lat_1254_IT-LAT0014.txt | |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 683 | IT-LAT0534_1 | De origine et situ Germanorum | Tacitus, Publius (Gaius) Cornelius | 116 | prose | lat_0116_IT-LAT0534_1.txt | 0 |
| 684 | IT-LAT0534_2 | De vita Iulii Agricolae | Tacitus, Publius (Gaius) Cornelius | 116 | prose | lat_0116_IT-LAT0534_2.txt | 0 |
| 685 | IT-LAT0534_3 | Dialogus de oratoribus | Tacitus, Publius (Gaius) Cornelius | 116 | prose | lat_0116_IT-LAT0534_3.txt | 0 |
| 686 | IT-LAT0534_4 | Historiae | Tacitus, Publius (Gaius) Cornelius | 116 | prose | lat_0116_IT-LAT0534_4.txt | 0 |
| 1265 | IT-LAT0202 | Institutiones | Iustinianus, Caesar Flavius (Imperator Iustini... | 533 | prose | lat_0533_IT-LAT0202.txt | 450 |
670 rows × 7 columns
def convert_dates(sign, date0):
    """Zero-pad a year and prepend its sign, matching the corpus file-name scheme.

    Parameters
    ----------
    sign : str
        "0" for CE dates (no prefix) or a sign string such as "-" for BCE dates.
        An explicit "+" prefix is stripped from the result.
    date0 : int
        Absolute year value; 0 always maps to "0000" regardless of sign.

    Returns
    -------
    str
        e.g. ("0", 382) -> "0382", ("-", 9) -> "-009", (any, 0) -> "0000".

    The original implementation duplicated the whole padding ladder in two
    branches that differed only by a "+" prefix which was unconditionally
    stripped at the end — the branches are collapsed here; behaviour is
    unchanged.

    NOTE(review): years 1-9 come out three digits wide ("005"), while 10-999
    are four wide ("0042", "0382"). Preserved as-is — confirm against the
    actual file-name convention before changing the padding.
    """
    if date0 == 0:
        return "0000"
    # CE dates ("0") carry no prefix; any other sign string is kept verbatim
    prefix = "" if sign == "0" else str(sign)
    if date0 < 100:
        final_date = prefix + "00" + str(date0)
    elif date0 < 1000:
        final_date = prefix + "0" + str(date0)
    else:
        final_date = prefix + str(date0)
    # an explicit "+" sign is stripped, mirroring the original behaviour
    if final_date.startswith("+"):
        final_date = final_date.replace("+", "")
    return final_date
# prepare the corpus: the cleaning list now also drops empty tokens ('')
punctuation = ['.', ',', '...', ';', ':', '?', '(', ')', '-', '!',
               '[', ']', '"', "'", '""', '\n', '']
# define corpus subset: keep only documents dated up to last_date
corpus_subset = metadata_df[metadata_df['date'] <= last_date].copy().reset_index(drop=True)
filenames_subset = corpus_subset['file']  # filenames were defined above to get IT files only
class NltkCorpusFromDirNew(PlaintextCorpusReader):
    """PlaintextCorpusReader variant with optional lowercasing and punctuation removal."""
    word_tokenizer = SpaceTokenizer()  # tokenize on whitespace
    sent_tokenizer = LineTokenizer()   # assume one sentence per line

    def __init__(
        self,
        root,
        fileids,
        encoding="utf8",
        word_tokenizer=word_tokenizer,
        sent_tokenizer=sent_tokenizer,
        tolower=False, punctuation=None
    ):
        PlaintextCorpusReader.__init__(self, root=root, fileids=fileids, encoding=encoding,
                                       word_tokenizer=word_tokenizer,
                                       sent_tokenizer=sent_tokenizer)
        # post-tokenization cleaning options applied in _read_word_block
        self.tolower = tolower
        self.punctuation = punctuation

    def _read_word_block(self, stream):
        """Read 20 lines from *stream* and return their cleaned tokens."""
        collected = []
        for _ in range(20):  # read 20 lines at a time
            for token in self._word_tokenizer.tokenize(stream.readline()):
                # punctuation/empty-token filtering only applies when a
                # punctuation list was supplied
                if self.punctuation is not None and (token in self.punctuation or token == ''):
                    continue
                collected.append(token.lower() if self.tolower == True else token)
        return collected
# prepare the corpus: lowercased lemmas with punctuation stripped
latinise = NltkCorpusFromDirNew(root=dir_in, fileids=filenames_subset,
                                punctuation=punctuation, tolower=True)
# wrap each document in an nltk Text for convenient inspection
latinise_docs = [Text(latinise.words(fileid)) for fileid in latinise.fileids()]
print("This corpus contains ", len(latinise_docs), " documents.")
This corpus contains 385 documents.
# flat list of cleaned sentences
# NOTE(review): each appended element is a *generator* object, not a list —
# the tokens are never materialized here, unlike the list comprehension used
# for the time-sliced cleaning further on. Likely a latent bug; preserved
# as-is because `corpus` is not read downstream in this section.
corpus = list()
for sent in latinise.sents():
    corpus.append(token.lower() for token in sent if token not in punctuation and token != '')
The corpus is split into slices, each covering size_interval years.
# dictionary that maps each time-interval index to the cleaned sentences of its texts
time2corpus = dict()
# loop over the time slices (the redundant trailing 900 edge is skipped)
for t in range(n_intervals):
    interval_start = intervals[t]
    mask = corpus_subset['time_interval'] == interval_start
    files_corpus_t = list(corpus_subset.loc[mask]["file"])
    print("retrieving the subcorpus for interval ", interval_start)
    cleaned = []
    for sent in latinise.sents(fileids=files_corpus_t):
        cleaned.append([token.lower() for token in sent
                        if token not in punctuation and token != ''])
    time2corpus[t] = cleaned
retrieving the subcorpus for interval -450 retrieving the subcorpus for interval 0 retrieving the subcorpus for interval 450
The time2corpus variable is a dictionary with time slices as keys. Each item is a list of sentences, each being a list of lemmas.
# quick sanity check of the time-sliced corpus
print(f'Dictionary keys are: {list(time2corpus.keys())}')
print('First 3 sentences from the 3rd corpus slice are: ', time2corpus[2][:2])
Dictionary keys are: [0, 1, 2] First 3 sentences from the 3rd corpus slice are: [['obsculta', 'o', 'filius', 'praeceptum', 'magister', 'et', 'inclino', 'auris', 'cor', 'tuus', 'et', 'admonitio', 'pius', 'pater', 'libet', 'excipe', 'et', 'efficaciter', 'comple', 'ut', 'ad', 'is', 'per', 'oboedientia', 'labor', 'redeo', 'ab', 'quo', 'per', 'inoboedientia', 'desidia', 'recedo'], ['ad', 'tu', 'ergo', 'nunc', 'ego', 'sermo', 'dirigo', 'quisquis', 'abrenuntio', 'proprius', 'voluntas', 'dominus', 'christus', 'verus', 'rex', 'militaturus', 'oboedientia', 'fortis', 'atque', 'praeclarus', 'arma', 'sumo']]
We're going to use functions available in the nltk package. In order to do so, we first need (1) to convert corpora into the nltk-compatible format. Next, from each corpus, we are (2) retrieving lists of n-grams which (3) we feed to the so-called finders which count n-grams, filter out stopwords, and apply association strength measures to frequency counts.
# the synchronic analysis below uses the full (non-sliced) corpus
corpus_nltk = latinise
n_sents, n_words = len(corpus_nltk.sents()), len(corpus_nltk.words())
print(f"This corpus contains {n_sents} sentences and {n_words} words.")
This corpus contains 318340 sentences and 5298018 words.
First, we're retrieving 50 collocations computed on non-contiguous 5-grams of each term in the entire corpus.
if rebuild == True:
    # recompute the top-50 collocations of every term over the whole corpus
    collocs = list()
    for term in socio_political_terms:
        print(f"\nBuilding finder for the term: {term}")
        colls = BuildCollocs(corpus_nltk, term=term, window=5, filtering=True, top=50)
        colls.getFinder()
        print(f"Getting top 50 collocations for the term: {term}")
        collocs.append((term, colls.getAllNtops()))
    # saving collocation sets for the next generations
    # NOTE(review): results are written to the working directory but read back
    # from out/models/ — confirm the cached file is moved there manually.
    with open('collocations_all.pickle', 'wb') as f:
        pickle.dump(collocs, f)
elif rebuild == False:
    with open('out/models/collocations_all.pickle', 'rb') as f:
        collocs = pickle.load(f)
Let's inspect the data structure: each 2-tuple contains:
# each entry of `collocs` is (term, [(measure_name, [bigram, ...]), ...])
print("a term ==> ", collocs[0][0],
"\n", "a list of collocation tuples ==> ", collocs[0][1][0] )
a term ==> civitas
a list of collocation tuples ==> ('chi_sq', [('civitas', 'velovocorum'), ('in', 'civitas'), ('civitas', 'restaurasse'), ('alexandrinae', 'civitas'), ('antiochenae', 'civitas'), ('civitas', 'muratus'), ('primoris', 'civitas'), ('princeps', 'civitas'), ('civitas', 'exterminari'), ('alexandrina', 'civitas'), ('civitas', 'hrofi'), ('inmunitates', 'civitas'), ('platea', 'civitas'), ('rages', 'civitas'), ('civitas', 'lundonia'), ('civitas', 'pergamena'), ('singulasque', 'civitas'), ('augustofratensis', 'civitas'), ('bergistanorum', 'civitas'), ('bitoricae', 'civitas'), ('cimeliarchio', 'civitas'), ('civitas', 'arbee'), ('civitas', 'auferatis'), ('civitas', 'carturi'), ('civitas', 'galaditidis'), ('civitas', 'pirisaboram'), ('clypeam', 'civitas'), ('devastabunt', 'civitas'), ('eboracae', 'civitas'), ('efron', 'civitas'), ('gaditana', 'civitas'), ('singidunum', 'civitas'), ('veronamque', 'civitas'), ('civitas', 'david'), ('coriolos', 'civitas'), ('civitas', 'dono'), ('civitas', 'palmarum'), ('civitas', 'nazareth'), ('civitas', 'iudas'), ('porta', 'civitas'), ('graecia', 'civitas'), ('civitas', 'regnavitque'), ('finitimus', 'civitas'), ('amison', 'civitas'), ('andegavis', 'civitas'), ('astensis', 'civitas'), ('carnain', 'civitas'), ('civitas', 'aegyptiacam'), ('civitas', 'brittiis'), ('civitas', 'irruptus')])
Now we'll retrieve collocations for every period of the time-segmented corpus.
if rebuild == True:
    # retrieve top-50 collocations per term for every time slice
    collocs_time = list()
    for key, corp in time2corpus.items():
        # read the slice into an NLTK-compatible corpus
        corp_nltk = NltkCorpusFromList(corp)
        print(f"\nBuilding for the corpus: {key}")
        print(f"This corpus contains {len(corp_nltk.sents())} sentences and {len(corp_nltk.words())} words.")
        for term in socio_political_terms:
            print(f"\nBuilding finder for the term: {term}")
            colls = BuildCollocs(corp_nltk, term=term, window=5, filtering=True, top=50)
            colls.getFinder()
            print(f"Getting top 50 collocations for the term: {term}")
            collocs_time.append((key, term, colls.getAllNtops()))
    # saving collocation sets for the next generations
    with open('collocations_all_time.pickle', 'wb') as f:
        pickle.dump(collocs_time, f)
elif rebuild == False:
    with open('out/models/collocations_all_time.pickle', 'rb') as f:
        collocs_time = pickle.load(f)
The structure of the collocs_time is similar to collocs, as the list contains 3-tuples of (period, term, list_of_collocs):
[ ( period_0, term_0, [(measure_0, [collocs_0])] ) ]
# each entry of `collocs_time` is (period_index, term, [(measure_name, [bigram, ...]), ...])
print("period ==> ", collocs_time[0][0], "\n",
"term ==> ", collocs_time[0][1], "\n",
"a list of collocation tuples ==> ", collocs_time[0][2][0] )
period ==> 0
term ==> civitas
a list of collocation tuples ==> ('chi_sq', [('civitas', 'dono'), ('princeps', 'civitas'), ('primoris', 'civitas'), ('bergistanorum', 'civitas'), ('gaditana', 'civitas'), ('foederatus', 'civitas'), ('in', 'civitas'), ('status', 'civitas'), ('civitas', 'donarunt'), ('civitas', 'pergamena'), ('absque', 'civitas'), ('altiusque', 'civitas'), ('amantini', 'civitas'), ('ascribi', 'civitas'), ('avaritiaque', 'civitas'), ('aveniensem', 'civitas'), ('calventi', 'civitas'), ('ccicc', 'civitas'), ('certim', 'civitas'), ('civitas', 'admonitosque'), ('civitas', 'amicitae'), ('civitas', 'ancillaris'), ('civitas', 'annitebatur'), ('civitas', 'apelaurum'), ('civitas', 'caeritem'), ('civitas', 'camertinum'), ('civitas', 'capenatiumque'), ('civitas', 'celeiates'), ('civitas', 'cerdiciatesque'), ('civitas', 'chaldaeicum'), ('civitas', 'classiumque'), ('civitas', 'conmunicatis'), ('civitas', 'conpensatio'), ('civitas', 'delerant'), ('civitas', 'depeculatus'), ('civitas', 'descendissetque'), ('civitas', 'dicarit'), ('civitas', 'dictique'), ('civitas', 'discive'), ('civitas', 'donatus'), ('civitas', 'epiros'), ('civitas', 'exinanitae'), ('civitas', 'explorandumque'), ('civitas', 'exterminarint'), ('civitas', 'extrueret'), ('civitas', 'foederatisque'), ('civitas', 'frusinates'), ('civitas', 'gemituque'), ('civitas', 'gravisque'), ('civitas', 'hampsicoram')])
Let's now convert these data structures to a dictionary for easier access.
# convert the list of (term, collocation_sets) tuples into a nested dict:
# collocs_dict[term][measure] == [list_of_bigrams, list_of_bare_collocates]
collocs_dict = {}
for term, collocs_set in collocs:
    # the original used a redundant setdefault(term)/None check followed by a
    # reset; a single setdefault with a dict default is equivalent
    collocs_dict.setdefault(term, {})
    for coeff, colls in collocs_set:  # coeff: 'dice', 'chi_sq', ...
        # strip the node word from each bigram, keeping only the collocates
        colls_only = [list(filter(lambda w: w != term, bigram)) for bigram in colls]
        # a (term, term) bigram collapses to [], restore it as the term itself
        colls_only = [x[0] if len(x) > 0 else term for x in colls_only]
        # the original setdefault(coeff, []) was immediately overwritten with
        # [] and then appended to — assign the final 2-element list directly
        collocs_dict[term][coeff] = [colls, colls_only]
The collocations for each term and association_measure may be accessed as:
collocs_dict[term][association_measure].
For example:
# inspect the dictionary: the entry is a 2-element list — [0] the bigrams, [1] the bare collocates
print(collocs_dict["civitas"]["dice"])
[[('princeps', 'civitas'), ('in', 'civitas'), ('civitas', 'david'), ('porta', 'civitas'), ('de', 'civitas'), ('civitas', 'iudas'), ('civitas', 'dono'), ('civitas', 'romanus'), ('noster', 'civitas'), ('civitas', 'suus'), ('per', 'civitas'), ('totus#2', 'civitas'), ('idem', 'civitas'), ('apud', 'civitas'), ('provincia', 'civitas'), ('graecia', 'civitas'), ('universus', 'civitas'), ('episcopus', 'civitas'), ('ad', 'civitas'), ('civitas', 'et'), ('omnis', 'civitas'), ('civitas', 'quis#2'), ('civitas', 'noster'), ('multus', 'civitas'), ('singulus', 'civitas'), ('murus', 'civitas'), ('et', 'civitas'), ('civitas', 'capio'), ('hic', 'civitas'), ('civitas', 'civitas'), ('civitas', 'sanctus'), ('civitas', 'in'), ('civitas', 'rex'), ('civitas', 'do'), ('civitas', 'mitto'), ('civitas', 'omnis'), ('civitas', 'cum'), ('primoris', 'civitas'), ('civitas', 'episcopus'), ('asia', 'civitas'), ('liber', 'civitas'), ('rex', 'civitas'), ('usque', 'civitas'), ('civitas', 'atque'), ('venio', 'civitas'), ('quidam', 'civitas'), ('aedifico', 'civitas'), ('hierosolyma', 'civitas'), ('civitas', 'magnus'), ('civitas', 'murus')], ['princeps', 'in', 'david', 'porta', 'de', 'iudas', 'dono', 'romanus', 'noster', 'suus', 'per', 'totus#2', 'idem', 'apud', 'provincia', 'graecia', 'universus', 'episcopus', 'ad', 'et', 'omnis', 'quis#2', 'noster', 'multus', 'singulus', 'murus', 'et', 'capio', 'hic', 'civitas', 'sanctus', 'in', 'rex', 'do', 'mitto', 'omnis', 'cum', 'primoris', 'episcopus', 'asia', 'liber', 'rex', 'usque', 'atque', 'venio', 'quidam', 'aedifico', 'hierosolyma', 'magnus', 'murus']]
Let's assume that the overlap between collocational sets of two or more words is indicative of their (dis)similarity. In this section, we are investigating synchronic collocation sets, that is collocations retrieved from the entire corpus. In the next sections, we'll be taking a closer look at diachronic overlap.
We choose for further analyses collocates retrieved with Dice coefficient as it usually yields the most interpretable results in manual corpus analysis (ie. content words, easy to understand syntagmatic and paradigmatic relation to node word).
# collect, per term, the bare Dice collocates for pairwise comparison
labels = []
coll_sets = []
for term, measures in collocs_dict.items():
    labels.append(term)
    coll_sets.append(measures["dice"][1])  # [1] == collocates-only list
df = collDiffs.collDf(coll_sets, labels)  # the df variable will be recycled
df.head()
| colloc | slice | rank | |
|---|---|---|---|
| 0 | princeps | civitas | 1 |
| 1 | in | civitas | 2 |
| 2 | david | civitas | 3 |
| 3 | porta | civitas | 4 |
| 4 | de | civitas | 5 |
Let's look which terms overlap the most and the least.
# plotting looong ranking tables
def showLongTable(dataframe, show=True, color=None, colormap = None):
    """Render *dataframe* as a plotly Table.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to display, one table column per dataframe column.
    show : bool
        If True, display the figure; otherwise return the bare go.Table
        trace (useful for embedding in subplots).
    color : str or None
        Name of the column whose values select row colours via *colormap*.
    colormap : dict or None
        Mapping from values of the *color* column to colour strings.

    FIX: removed a `dataframe.iteritems()` call whose result was discarded
    (a pure no-op), and switched the remaining iteration to `.items()` —
    `iteritems()` was deprecated and removed in pandas 2.0.
    """
    rowcolors = None
    if color is not None and colormap is not None:
        rowcolors = [ colormap[i] for i in dataframe[color] ]
    tbl = go.Table(
        header=dict(values=list(dataframe.columns),
                    align='center'),
        cells=dict(values=[data for (col, data) in dataframe.items()],
                   fill_color=[rowcolors, "white", "white"],
                   line_color=[rowcolors] if colormap is not None else None,
                   align='center'))
    return go.Figure(data=[tbl]).show() if show == True else tbl
# most ...
top5_desc = collDiffs.getNTop(df, top=5, ascending=False)
fig1 = showLongTable(top5_desc, show=False, color="node",
                     colormap=color_discrete_map_terms)
# ... and least similar by number of overlapping collocations
top5_asc = collDiffs.getNTop(df, top=5, ascending=True)
fig2 = showLongTable(top5_asc, show=False, color="node",
                     colormap=color_discrete_map_terms)
# plot both tables side by side
fig = make_subplots(rows=1, cols=2, specs=[[{"type": "table"}, {"type": "table"}]])
fig.add_trace(fig1, row=1, col=1)
fig.add_trace(fig2, row=1, col=2)
fig.update_layout(
    height=800,
    title_text="5 most and least similar terms by number of overlapping collocates<br><sup>(in the 'least' table only 5 results are printed)</sup>",
)
fig.show()
In terms of collocational overlap count, the most similar pairs of terms in our set are:
# 10 most similar word pairs in the set (by colloc overlap)
sims_all = collDiffs.getNTop(df, top=-1, ascending=False).sort_values("count",ascending=False).drop_duplicates(
subset=["node","collocate"])
# NOTE(review): the result of this filter is discarded — likely intended
# `sims_all = sims_all[...]` to drop self-pairs; confirm before changing.
sims_all[sims_all["node"] != sims_all["collocate"]]
# drop mirrored (a, b)/(b, a) duplicates via order-insensitive frozenset rows
dupes = sims_all.apply(frozenset,axis=1).duplicated()
sims = sims_all[~dupes]
prettyprint = True
# styled gradient view when prettyprint, plain nlargest otherwise
sims.nlargest(10,"count").style.background_gradient() if prettyprint else sims.nlargest(10,"count")
| node | count | collocate | |
|---|---|---|---|
| 68 | gens | 15 | populus |
| 0 | civitas | 13 | populus |
| 1 | civitas | 13 | urbs |
| 2 | civitas | 12 | gens |
| 85 | hostis | 11 | dux |
| 187 | pontifex | 10 | consul |
| 307 | urbs | 10 | hostis |
| 17 | consilium | 9 | populus |
| 224 | populus | 9 | senatus |
| 71 | gens | 7 | regnum |
The least similar pairs, on the other hand, are:
# 10 most dissimilar terms in the set (by colloc overlap); zero-overlap pairs rank lowest
sims.nsmallest(10,"count").style.background_gradient() if prettyprint else sims.nsmallest(10,"count")
| node | count | collocate | |
|---|---|---|---|
| 300 | sodes | 0 | hostis |
| 271 | regnum | 0 | pontifex |
| 321 | urbs | 0 | pontifex |
| 320 | urbs | 0 | nobilitas |
| 319 | urbs | 0 | natio |
| 270 | regnum | 0 | hostis |
| 318 | urbs | 0 | jus |
| 252 | potestas | 0 | labor |
| 288 | senatus | 0 | pontifex |
| 285 | senatus | 0 | labor |
Let's inspect the collocational overlap of the entire set of terms (~ semantic similarity).
# plot the term-by-term similarity (overlap-count) matrix
heatmap = collDiffs.plotCollDf(df, show=False)
heatmap.update_xaxes(title="Term")
heatmap.update_yaxes(title="Term")
heatmap.update_layout(height=800,
                      title_text="Number of overlapping collocations")
heatmap.show()
# TODO: normalize counts
# TODO: overlaps by the collocate's rank
# TODO: limit display to count > 1
Distributional similarity via collocational overlap may be used to discover term clusters.
# hierarchical clustering of terms on the all-to-all overlap matrix
# default linkage method = complete
dendro1 = ff.create_dendrogram(collDiffs.all2all(coll_sets)[2],
                               orientation='left', labels=labels)
# let's switch linkage method to Ward
dendro2 = ff.create_dendrogram(collDiffs.all2all(coll_sets)[2],
                               orientation='left', labels=labels,
                               linkagefun=lambda x: sch.linkage(x, 'ward'))
fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.3)
for trace in dendro1['data']:
    fig.add_trace(trace, row=1, col=1)
fig.update_xaxes(title_text="linkage = complete", row=1, col=1,
                 ticktext=dendro1.layout.yaxis.ticktext, showticklabels=False)
fig.update_yaxes(row=1, col=1, ticktext=dendro1.layout.yaxis.ticktext,
                 tickvals=dendro1.layout.yaxis.tickvals)
for trace in dendro2['data']:
    fig.add_trace(trace, row=1, col=2)
fig.update_xaxes(title_text="linkage = Ward", row=1, col=2,
                 showticklabels=False)
fig.update_yaxes(row=1, col=2, ticktext=dendro2.layout.yaxis.ticktext,
                 tickvals=dendro2.layout.yaxis.tickvals)
fig.update_layout(title="Similar terms clustered by the number of overlapping collocates",
                  showlegend=False, height=500, width=900)
fig.show()
Theoretically, each of the socio_political_terms can have (len(socio_political_terms) -1) * ntop (where ntop is a number of top collocates taken into consideration) overlapping collocates. This fact may be used to gauge both the semantic coherence of the set as well as "prototypicality" of each of the terms or their semantic distance from other terms in the set.
Let's inspect total overlap counts for all terms in the set.
# terms by total number of overlapping collocates (~ set coherence)
sims_all_sum = sims_all[["node", "count"]].groupby("node").sum().reset_index().sort_values("count", ascending=False)
sims_med = sims_all_sum["count"].median()
# NOTE(review): plotly.express expects `category_orders` as a dict, not a
# list of dicts, and `color_discrete_map` has no effect without a `color=`
# argument — confirm whether these options ever took effect.
fig = px.bar(sims_all_sum, x='node', y='count',
#orientation="h",
category_orders=[{"node":sims_all_sum["node"]}],
color_discrete_map=color_discrete_map_terms)
# dashed red reference line at the median overlap count
fig.add_hline(y=sims_med, line_color="red", line_dash="dash",
annotation_text="median = "+str(sims_med),
annotation_position="bottom right",annotation_font_color="red")
fig.update_layout(title="Overall number of overlapping collocates", height=400)
fig.show()
# TODO: check how it changes with ranks = [10, 20, ...]
Let's now turn to time-aware collocation counts.
As before, to make data manipulation easier, we'll start with converting collocation lists to a Python dictionary.
# convert the list of (period, term, collocation_sets) tuples into a nested
# dict: collocs_time_dict[period][term][measure] == [bigrams, bare_collocates]
collocs_time_dict = {}
for period, term, collocs_set in collocs_time:
    # the original used a redundant setdefault(period)/None check followed by
    # a reset; a single setdefault with a dict default is equivalent
    collocs_time_dict.setdefault(period, {})
    collocs_time_dict[period].setdefault(term, {})
    for coeff, colls in collocs_set:  # coeff: 'dice', 'chi_sq', ...
        # strip the node word from each bigram, keeping only the collocates
        colls_only = [list(filter(lambda w: w != term, bigram)) for bigram in colls]
        # a (term, term) bigram collapses to [], restore it as the term itself
        colls_only = [x[0] if len(x) > 0 else term for x in colls_only]
        # each (period, term, coeff) occurs once, so a direct assignment of
        # the 2-element list replaces the setdefault-then-append sequence
        collocs_time_dict[period][term][coeff] = [colls, colls_only]
The collocations for each period, term and association_measure may be accessed as:
collocs_time_dict[period][term][association_measure]. The 2-tuple thus retrieved contains the original list of 2-grams and a simplified list of collocates.
For example:
# inspect the dictionary: index [0] holds the bigrams, [1] the bare collocates
print("original list ==> ", collocs_time_dict[0]["civitas"]["dice"][0], "\n\n", "collocates only ==> ", collocs_time_dict[0]["civitas"]["dice"][1])
original list ==> [('princeps', 'civitas'), ('civitas', 'dono'), ('noster', 'civitas'), ('in', 'civitas'), ('graecia', 'civitas'), ('civitas', 'suus'), ('liber', 'civitas'), ('status', 'civitas'), ('primoris', 'civitas'), ('hic', 'civitas'), ('de', 'civitas'), ('ex', 'civitas'), ('asia', 'civitas'), ('civitas', 'noster'), ('civitas', 'do'), ('muto', 'civitas'), ('civitas', 'mitto'), ('civitas', 'status'), ('unus', 'civitas'), ('civitas', 'princeps'), ('omnis', 'civitas'), ('civitas', 'quis#2'), ('alius', 'civitas'), ('jus', 'civitas'), ('ceterus', 'civitas'), ('nullus', 'civitas'), ('civitas', 'impero'), ('libertas', 'civitas'), ('vir', 'civitas'), ('universus', 'civitas'), ('rex', 'civitas'), ('civitas', 'sum'), ('finitimus', 'civitas'), ('lex', 'civitas'), ('civitas', 'mos'), ('totus#2', 'civitas'), ('civitas', 'non'), ('civitas', 'rex'), ('summus', 'civitas'), ('civitas', 'omnis'), ('civitas', 'hic'), ('et', 'civitas'), ('is', 'civitas'), ('civitas', 'teneo'), ('civitas', 'libertas'), ('civitas', 'atque'), ('civitas', 'legatus'), ('civitas', 'tollo'), ('homo', 'civitas'), ('consilium', 'civitas')]
collocates only ==> ['princeps', 'dono', 'noster', 'in', 'graecia', 'suus', 'liber', 'status', 'primoris', 'hic', 'de', 'ex', 'asia', 'noster', 'do', 'muto', 'mitto', 'status', 'unus', 'princeps', 'omnis', 'quis#2', 'alius', 'jus', 'ceterus', 'nullus', 'impero', 'libertas', 'vir', 'universus', 'rex', 'sum', 'finitimus', 'lex', 'mos', 'totus#2', 'non', 'rex', 'summus', 'omnis', 'hic', 'et', 'is', 'teneo', 'libertas', 'atque', 'legatus', 'tollo', 'homo', 'consilium']
From the original dictionary, we're deriving a number of data containers, mainly to simplify visualization.
# re-key the nested dict as collocs_time_by_term[term][period_label]
# == the term's bare Dice collocates in that period
collocs_time_by_term = dict()
for period, collocs in collocs_time_dict.items():
    period_lbl = periods_labels[period]  # hoisted: invariant across terms (e.g. 0 -> "-450-0")
    for term in collocs.keys():
        collocs_time_by_term.setdefault(term, {})
        # the original called setdefault(period_lbl, []) and then immediately
        # overwrote the entry — a direct assignment is equivalent
        collocs_time_by_term[term][period_lbl] = collocs[term]["dice"][1]
print("collocs_time_by_term is a Python ", type(collocs_time_by_term), "\n" , "collocs_time_by_term[term][period] ==> a list of the collocates of the term in specific period")
collocs_time_by_term is a Python <class 'dict'> collocs_time_by_term[term][period] ==> a list of the collocates of the term in specific period
# generate (node, collocate, rank) dataframes, one per term
collocs_time_by_term_dfs = dict.fromkeys(collocs_time_by_term.keys()) # dictionary of per-term dataframes
periods = []
for term in collocs_time_by_term_dfs.keys():
    coll_sets_time = []
    # BUG FIX: `periods` was initialized once outside this loop, so the
    # labels list kept growing across terms (3, 6, 9, ... entries) while
    # coll_sets_time was rebuilt each iteration; reset it per term, matching
    # the parallel overlap-matrix loop further on.
    periods = []
    for period, colls in collocs_time_by_term[term].items():
        periods.append(period)
        coll_sets_time.append(colls)
    collocs_time_by_term_dfs[term] = collDiffs.collDf(coll_sets=coll_sets_time, labels=periods)
print("collocs_time_by_term_dfs is a ", type(collocs_time_by_term_dfs), "\n" , "collocs_time_by_term_dfs[term] ==> df of the collocates with time period and rank")
collocs_time_by_term_dfs["civitas"].head()
collocs_time_by_term_dfs is a <class 'dict'> collocs_time_by_term_dfs[term] ==> df of the collocates with time period and rank
| colloc | slice | rank | |
|---|---|---|---|
| 0 | princeps | -450-0 | 1 |
| 1 | dono | -450-0 | 2 |
| 2 | noster | -450-0 | 3 |
| 3 | in | -450-0 | 4 |
| 4 | graecia | -450-0 | 5 |
Several contain explicit overlap counts.
# overlap counts: by term by period
collocs_time_by_term_mats = dict.fromkeys(collocs_time_by_term.keys())  # dictionary of overlap counts
for term in collocs_time_by_term_mats.keys():
    # dict preserves insertion order, so keys() and values() stay aligned
    periods = list(collocs_time_by_term[term].keys())
    coll_sets_time = list(collocs_time_by_term[term].values())
    collocs_time_by_term_mats[term] = collDiffs.all2all(coll_sets=coll_sets_time)[2]
print("collocs_time_by_term_mats is a ", type(collocs_time_by_term_mats), "\n" ,
"collocs_time_by_term_mats[term] ==> 2D matrix of overlap counts (n_periods, n_periods);\ each row represents total overlap counts of a specific period with other periods\n")
for i, overlap in enumerate(collocs_time_by_term_mats["civitas"]):
    print("period: ", i, "overlap count: ", overlap)
collocs_time_by_term_mats is a <class 'dict'> collocs_time_by_term_mats[term] ==> 2D matrix of overlap counts (n_periods, n_periods);\ each row represents total overlap counts of a specific period with other periods period: 0 overlap count: [43 14 7] period: 1 overlap count: [14 42 18] period: 2 overlap count: [ 7 18 45]
# overlap counts: by term: (preceding, following)
# FIX: dict.fromkeys(keys, []) shared ONE list object across all keys (the
# classic mutable-default pitfall); every value is reassigned in the loop
# below, so initialize with the None default instead.
collocs_time_by_term_overlap = dict.fromkeys(collocs_time_by_term.keys())
for term in collocs_time_by_term_overlap.keys():
    # number of overlapping collocates between (preceding, following) periods
    term_overlap = list()
    for i, overlap in enumerate(collocs_time_by_term_mats[term]):
        # row i: overlaps of period i with every period; neighbours sit at
        # i-1 and i+1, with None at the extreme left and right
        pre = overlap[i-1] if i > 0 else None
        post = overlap[i+1] if i < len(overlap)-1 else None
        term_overlap.append((pre,post))
    collocs_time_by_term_overlap[term] = term_overlap
# None is set for extreme left and right
#collocs_time_by_term_overlap["civitas"]
for i, overlap in enumerate(collocs_time_by_term_overlap["civitas"]):
    print("period: ", i, "==> overlap with preceding and following period: ", overlap)
period: 0 ==> overlap with preceding and following period: (None, 14) period: 1 ==> overlap with preceding and following period: (14, 18) period: 2 ==> overlap with preceding and following period: (18, None)
# long-format df with overlap counts: term | t_i-1 | t_i | overlap count
overs = []
for term in collocs_time_by_term_mats.keys():
    # `periods` are the labels left over from the earlier loop; assumes every
    # term shares the same ordered period labels — TODO confirm
    for period1, overlaps in zip(periods, collocs_time_by_term_mats[term]):
        for period2, overlap in zip(periods, overlaps):
            if period1 != period2:  # skip the diagonal (self-overlap)
                overs.append([term, period1, period2, overlap])
overs_df = pd.DataFrame(overs, columns=["term", "source", "target", "count"])
# A row and its mirror (source/target swapped) reduce to the same frozenset, so
# `duplicated` keeps only one direction of each period pair; this relies on the
# overlap matrix being symmetric (so the counts match) — TODO confirm
dupes = overs_df.apply(frozenset,axis=1).duplicated() #filter out dupes
overs_df = overs_df[~dupes]
overs_df["term"] = overs_df["term"].astype("category")
# we're making sure the categories are encoded in the same way
# (same category list => same .cat.codes for source and target, which the
# Sankey charts below rely on)
overs_df["target"] = pd.Series(overs_df["target"]).astype('category').cat.set_categories(periods)
overs_df["source"] = pd.Series(overs_df["source"]).astype('category').cat.set_categories(periods)
overs_df.head()
#overs_df.tail()
| term | source | target | count | |
|---|---|---|---|---|
| 0 | civitas | -450-0 | 0-450 | 14 |
| 1 | civitas | -450-0 | 450-900 | 7 |
| 3 | civitas | 0-450 | 450-900 | 18 |
| 6 | consilium | -450-0 | 0-450 | 19 |
| 7 | consilium | -450-0 | 450-900 | 5 |
We assume that diachronic collocational overlap (i.e., the number of shared collocations between periods t_i and t_j) is proportional to the semantic similarity of word occurrences and thus indicates the degree of semantic change.
Let's explore this, first, by plotting an overlap (ie. similarity) matrix for each term in our set.
# we'll first define plotting function to facilitate multiplot generation
def build_multiplot(cols, subplot_type, n_items, subplot_titles, **kwargs):
    """Create an empty plotly subplot grid able to hold `n_items` subplots.

    Parameters
    ----------
    cols : int
        Number of columns in the grid.
    subplot_type : str or None
        Plotly subplot type (e.g. "sankey") applied to every cell;
        None keeps the default cartesian axes.
    n_items : int
        Number of subplots that will be added to the figure later.
    subplot_titles : list of str or None
        One title per subplot.
    **kwargs
        Passed through to `plotly.subplots.make_subplots`.

    Returns
    -------
    (fig, rows_cols) : tuple
        `fig` is the empty figure; `rows_cols[i]` is the 1-based
        (row, col) cell for the i-th subplot, in row-major order.
    """
    # ceil(n_items / cols) rows are needed to fit all subplots
    full_rows, remainder = divmod(n_items, cols)
    rows = full_rows + 1 if remainder > 0 else full_rows
    # Fix: the original comprehension named these pairs (col, row) although
    # itertools.product(rows, cols) yields (row, col) — the values were used
    # correctly downstream, but the names were misleading.
    rows_cols = [ (row, col) for row, col in itertools.product(range(1, rows + 1), range(1, cols + 1)) ]
    specs = [ [ {"type": subplot_type} for _ in range(cols) ] for _ in range(rows) ] if subplot_type is not None else None
    fig = make_subplots(
        rows=rows, cols=cols,
        subplot_titles=subplot_titles if subplot_titles is not None else None,
        specs = specs,
        **kwargs
    )
    return fig, rows_cols
# plot heatmaps for every term
heatmaps = {}
for term, df in collocs_time_by_term_dfs.items():
    # plotCollDf builds the per-term overlap heatmap figure — TODO confirm signature
    fig = collDiffs.plotCollDf(df, show=False)
    fig=fig.update_layout(title="Collocational overlap: "+ term)
    heatmaps.setdefault(term,fig)
# assemble all per-term heatmaps into a 2-column grid
fig, rows_cols = build_multiplot(2, None, len(heatmaps), [ term for term in heatmaps.keys() ],
                                 shared_yaxes=True, shared_xaxes=False, vertical_spacing=0.04)
for i, heatmap in enumerate(heatmaps.items()):
    # heatmap is a (term, figure) pair; only the heatmap trace itself is reused
    fig.add_trace(heatmap[1]["data"][0], row=rows_cols[i][0], col=rows_cols[i][1])
fig.update_layout(height=1600, showlegend=True)
# categorical axes sorted so periods appear chronologically (x asc, y desc)
fig.update_xaxes(
    type="category",
    #title_text="Period",
    categoryorder="category ascending",
)
fig.update_yaxes(type="category",
                 #title_text="Period",
                 categoryorder="category descending")
fig.show()
selected="civitas"
# plot heatmaps for a single term
fig = collDiffs.plotCollDf(collocs_time_by_term_dfs[selected], show=False)
fig=fig.update_layout(title="Collocational overlap: "+ selected)
# categorical axes sorted so periods appear chronologically
fig.update_xaxes(
    type="category",
    #title_text="Period",
    categoryorder="category ascending",
)
fig.update_yaxes(type="category",
                 #title_text="Period",
                 categoryorder="category descending")
fig.update_layout(height=400, width=400, showlegend=False)
fig.show()
collocs_time_by_term_dfs[selected].head()
# group the selected term's collocates by period: one list per time slice
selected_collocs = collocs_time_by_term_dfs[selected].groupby("slice")["colloc"].apply(lambda x: x.tolist())
# pairwise overlap structures for the selected term — TODO confirm all2all's return layout
selected_collocs_diff = collDiffs.all2all(selected_collocs)
%pip install matplotlib-venn
Requirement already satisfied: matplotlib-venn in /media/HOME_DATA/miniconda3/envs/lvlt22/lib/python3.10/site-packages (0.11.7) Requirement already satisfied: scipy in /media/HOME_DATA/miniconda3/envs/lvlt22/lib/python3.10/site-packages (from matplotlib-venn) (1.8.1) Requirement already satisfied: matplotlib in /media/HOME_DATA/miniconda3/envs/lvlt22/lib/python3.10/site-packages (from matplotlib-venn) (3.5.2) Requirement already satisfied: numpy in /media/HOME_DATA/miniconda3/envs/lvlt22/lib/python3.10/site-packages (from matplotlib-venn) (1.22.4) Requirement already satisfied: pyparsing>=2.2.1 in /media/HOME_DATA/miniconda3/envs/lvlt22/lib/python3.10/site-packages (from matplotlib->matplotlib-venn) (3.0.9) Requirement already satisfied: python-dateutil>=2.7 in /media/HOME_DATA/miniconda3/envs/lvlt22/lib/python3.10/site-packages (from matplotlib->matplotlib-venn) (2.8.2) Requirement already satisfied: fonttools>=4.22.0 in /media/HOME_DATA/miniconda3/envs/lvlt22/lib/python3.10/site-packages (from matplotlib->matplotlib-venn) (4.34.4) Requirement already satisfied: cycler>=0.10 in /media/HOME_DATA/miniconda3/envs/lvlt22/lib/python3.10/site-packages (from matplotlib->matplotlib-venn) (0.11.0) Requirement already satisfied: kiwisolver>=1.0.1 in /media/HOME_DATA/miniconda3/envs/lvlt22/lib/python3.10/site-packages (from matplotlib->matplotlib-venn) (1.4.4) Requirement already satisfied: pillow>=6.2.0 in /media/HOME_DATA/miniconda3/envs/lvlt22/lib/python3.10/site-packages (from matplotlib->matplotlib-venn) (9.2.0) Requirement already satisfied: packaging>=20.0 in /media/HOME_DATA/miniconda3/envs/lvlt22/lib/python3.10/site-packages (from matplotlib->matplotlib-venn) (21.3) Requirement already satisfied: six>=1.5 in /media/HOME_DATA/miniconda3/envs/lvlt22/lib/python3.10/site-packages (from python-dateutil>=2.7->matplotlib->matplotlib-venn) (1.16.0) Note: you may need to restart the kernel to use updated packages.
from matplotlib_venn import venn3, venn3_circles
# three-set Venn diagram of the selected term's collocates across the periods
# NOTE(review): `periods_labels` is defined elsewhere in the notebook —
# confirm it matches the `periods` list used above
tmp = venn3([set(colls) for colls in selected_collocs], periods_labels)
plt.show()
# NOTE(review): venn3 patches can be None for empty subsets, in which case
# get_facecolor() would raise — confirm all seven subsets are non-empty here
venn_colors = [ circle.get_facecolor() for circle in tmp.patches ]
# printing collocs in every Venn's subset (following: https://github.com/konstantint/matplotlib-venn/blob/4e0d418ef9f7c9079aeb1139ae185b3b34fa2068/matplotlib_venn/_venn3.py#L219)
a, b, c = [ set(x) for x in selected_collocs ]
selected_collocs_diffs = []
selected_collocs_diffs.append(a - (b | c))   # only in period 1
selected_collocs_diffs.append(b - (a | c))   # only in period 2
selected_collocs_diffs.append((a & b) - c)   # periods 1 and 2 only
selected_collocs_diffs.append(c - (a | b))   # only in period 3
selected_collocs_diffs.append((a & c) - b)   # periods 1 and 3 only
selected_collocs_diffs.append((b & c) - a)   # periods 2 and 3 only
selected_collocs_diffs.append(a & b & c)     # shared by all three periods
for l in selected_collocs_diffs:
    print(len(l))
    print(*sorted(l), sep=", ")
27 alius, asia, atque, ceterus, consilium, ex, finitimus, homo, impero, jus, legatus, lex, liber, libertas, mitto, mos, muto, non, nullus, primoris, status, sum, summus, teneo, tollo, unus, vir 15 aedifico, civitas, david, extra, fugio, hierosolyma, israel, iudas, magnus, munio, platea, regnum, sanctus, urbs, villa 9 do, dono, et, graecia, is, noster, omnis, princeps, universus 25 apud, classis, coepio, cum, curialis, defensor, dux, egredior, episcopus, idem, igitur, infra, ipse, italia, iuxta, proprius, provincia, quoque, ravenna, regia, reliquus, ubi, vel, velovocorum, verus 2 hic, rex 13 ad, capio, ingredior, intro, multus, murus, per, porta, quidam, romanus, singulus, usque, venio 5 de, in, quis#2, suus, totus#2
# period_i:period_i+1 overlap dataframe
def simple_overlap(overlap_dict, periods):
periods_diffs = [ x+" : "+y for x,y in itertools.pairwise(periods)]
overlaps = list()
for term, overlap_arr in overlap_dict.items():
#overlaps.setdefault(term,)
for i, per_diff in enumerate(overlap_arr[0:len(overlap_arr)-1]):
overlaps.append([term, periods_diffs[i], per_diff[1]])
df = pd.DataFrame(overlaps)
df.columns = ["term", "period", "overlap"]
return df
# long-format consecutive-period overlap: term | period pair | overlap count
df = simple_overlap(collocs_time_by_term_overlap, periods)
# all
# NOTE(review): `color_discrete_map_terms` is defined elsewhere in the notebook
fig_all = px.line(df, x="period",y="overlap",
                  color="term", color_discrete_map=color_discrete_map_terms,
                  ) # filter out empty period
fig_all.update_layout(title="Number of overlapping collocations", height=600)
fig_all.show()
# TODO: plot change ratio instead of change count
# by term
selected_s = ['civitas', 'consilium', 'senatus', 'hostis', 'imperator', 'natio', 'pontifex', 'potestas']
df_term = df
#df_term["period"] = [ for t1, t2 in df_term["period"].map(lambda x: x.split(':')) ] #shorter labels
# NOTE(review): `prettyprint` is set elsewhere in the notebook — confirm it exists before this cell
if prettyprint:
    # compact publication figure: only the hand-picked subset of terms
    fig_by_term = px.line(df_term[df_term["term"].isin(selected_s)],
                          x="period", y="overlap", color="term", # filter out empty period
                          color_discrete_map=color_discrete_map_terms,
                          facet_col="term", facet_col_wrap=3,
                          )
    fig_by_term.update_layout(title="Number of overlapping collocations", height=400, showlegend=False)
    fig_by_term.update_xaxes(ticks='', showticklabels=False, tickangle=45, tickfont=dict(size=8))
else:
    # exploratory figure: every term, five facets per row
    fig_by_term = px.line(df_term,
                          x="period", y="overlap", color="term", # filter out empty period
                          color_discrete_map=color_discrete_map_terms,
                          facet_col="term", facet_col_wrap=5,
                          )
    fig_by_term.update_layout(title="Number of overlapping collocations", height=800, showlegend=False)
    fig_by_term.update_xaxes(ticks='', showticklabels=True, tickangle=45, tickfont=dict(size=8))
fig_by_term.show()
# by period: distribution of per-term overlap counts within each period pair
fig = px.box(df,
             y="overlap", facet_col="period", facet_col_wrap=4, facet_col_spacing=0,
             notched=True, points="all", hover_data=["term", "overlap", "period"],
             color="period", boxmode="overlay",
             )
# jitter=0 keeps the individual points vertically aligned with the box
fig.update_traces(jitter=0, showlegend=False)
fig.update_layout(title="Variation of the overlap counts (by period)", height=400, showlegend=False)
fig.show()
# distribution of overlap counts per term (publication vs exploratory variant)
if prettyprint:
    # by word — restricted to the hand-picked subset, clean white template
    fig = px.box(df[df["term"].isin(selected_s)],
                 y="overlap", x="term",
                 #facet_col="period",
                 notched=False,
                 points="all",
                 hover_data=["term", "overlap", "period"],
                 color="term", color_discrete_map=color_discrete_map_terms
                 )
    fig.update_traces(jitter=0, showlegend=False)
    fig.update_layout(title="Variation of the overlap counts (by term)",
                      height=300, showlegend=False, template="simple_white")
else:
    # by word — all terms, default template
    fig = px.box(df,
                 y="overlap", x="term",
                 #facet_col="period",
                 notched=False,
                 points="all",
                 hover_data=["term", "overlap", "period"],
                 color="term", color_discrete_map=color_discrete_map_terms
                 )
    fig.update_traces(jitter=0, showlegend=False)
    fig.update_layout(title="Variation of the overlap counts (by term)", height=300, showlegend=False)
fig.show()
#terms_colors = [ color_discrete_map_terms[t] for t in overs_df_all["term"] ]
# Sankey of total collocational overlap flowing between consecutive periods;
# one link per (term, period pair), link width = overlap count.
# NOTE(review): node labels are taken from `source` values only — the last
# period never appears as a source, and a categorical's unique() order may not
# match .cat.codes order; consider using `periods` directly for the labels.
fig = go.Figure(data=[go.Sankey(
    node = dict(
        pad = 15,
        line = dict(color = "black", width = 0),
        label = list(overs_df["source"].unique())
    ),
    # Add links
    link = dict(
        source = overs_df["source"].cat.codes,
        target = overs_df["target"].cat.codes,
        value = overs_df["count"],
        label = overs_df["term"],
    ))])
fig.update_layout(title_text="Total diachronic collocation overlap", font_size=20)
fig.show()
# prepare sankey chart of collocational overlap for every term in our set
sankeys = []
for term in socio_political_terms:
    overs_df_this = overs_df[overs_df["term"] == term].copy()
    # one link color per row (all the same, since the frame holds one term)
    terms_colors = [ color_discrete_map_terms[t] for t in overs_df_this["term"] ]
    fig = go.Sankey(
        arrangement="snap",
        node = dict(
            pad = 0,
            thickness = 30,
            line = dict(color = "black", width = 0.5),
            # NOTE(review): range(len(periods), 1) is EMPTY, so x=[] and no
            # order is actually forced here — probably meant range(len(periods)).
            # Also, Sankey node x/y coordinates are expected as fractions in
            # [0, 1], so the *10 scaling looks wrong; confirm intent before fixing.
            x=[x*10 for x in range(len(periods),1)], #we force the order in which periods are plotted,
            y=[i*10 for i in range(len(periods)) ],
            label = overs_df_this["source"].unique(),
        ),
        # Add links
        link = dict(
            source = overs_df_this["source"].cat.codes,
            target = overs_df_this["target"].cat.codes,
            value = overs_df_this["count"],
            line = dict(color = terms_colors, width = 0)
        ))
    sankeys.append(fig)
# lay the per-term sankeys out on a 3-column grid
fig, rows_cols = build_multiplot(3, "sankey", len(sankeys), socio_political_terms,
                                 shared_yaxes=True, shared_xaxes=False, vertical_spacing=0.04)
for i, sankey in enumerate(sankeys):
    fig.add_trace(sankey, row=rows_cols[i][0], col=rows_cols[i][1])
fig.update_layout(height=1200, title_text="Diachronic collocation overlap by term")
fig.show()
Until now, we have treated diachronic collocation sets as unstructured monoliths. In this section, we resort to word embeddings and clustering techniques in order to investigate fine-grained diachronic changes.
We will employ word2vec embeddings to assess semantic distance between collocates.
model_file = '/home/krzys/Kod/streamlit/voces/data/models/latinise_IT/latinise_w2v_v100w5min5'
# Fix: `if rebuild == True ... elif rebuild == False` left `model` undefined
# for any non-bool value of `rebuild`; a plain if/else covers all cases and is
# the idiomatic truth test.
if rebuild:
    # resource-heavy path: retrain word2vec on the lemmatized corpus and save it
    collocs_corpus = CorpusFromDir("/media/HOME_FOLDERS/krzys/Kod/streamlit/voces/data/corpora/latinise_IT_lemmas")
    collocs_model = BuildModels(collocs_corpus)
    mods = dict(word2vec=dict(vector_size=[100], # define parameters
                              window=[5],
                              min_count=[5]))
    latinise_w2v_v100w5min5 = collocs_model.build_many(mods)
    latinise_w2v_v100w5min5["word2vec"][0].save(model_file)
    model = latinise_w2v_v100w5min5["word2vec"][0]
else:
    # fast path: reuse the previously trained and saved model
    model = Word2Vec.load(model_file)
We can now annotate collocates with vectors retrieved from the word2vec model.
all_collocs_time = pd.concat(collocs_time_by_term_dfs) # all collocates for all terms for all periods
collocs = all_collocs_time["colloc"].unique() # only collocates
# find word2vec vectors for each collocate; words missing from the model
# vocabulary get a row of None so they can be detected and dropped below
# (idiom fix: membership test on the dict itself, not on .keys())
coll_vecs = [ model.wv[x]
              if x in model.wv.key_to_index else np.repeat(None, model.wv.vector_size)
              for x in collocs ]
# index the frame by collocate directly instead of assigning .index afterwards
coll_vecs_df = pd.DataFrame(coll_vecs, index=collocs)
not_in_model = coll_vecs_df.isna().all(axis=1)
print( len(coll_vecs_df.loc[not_in_model].index), " words weren't found in the word2vec dictionary, eg. ", [ x for x in coll_vecs_df.loc[not_in_model].index[0:10] ] )
coll_vecs_df = coll_vecs_df.loc[~not_in_model] # exclude collocates which are not found in the word2vec dictionary
coll_vecs_df.head()
193 words weren't found in the word2vec dictionary, eg. ['ahitofel', 'l.', 'm.', 'c.', 'q.', 'p.', 'a.', 'cn.', 'd.', 't.']
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| princeps | 1.499599 | 1.654052 | 0.772930 | 1.065428 | 0.207113 | 1.949996 | 0.902307 | -0.708749 | -0.655868 | 0.772048 | ... | -1.696397 | -0.042469 | -0.265360 | 1.599047 | 1.692026 | -0.218211 | -0.866921 | 0.780210 | -0.327714 | 0.286641 |
| dono | -0.028665 | 0.206296 | -0.636952 | -0.976660 | -1.009025 | 1.534521 | -0.018601 | 0.419318 | 1.731939 | 0.566920 | ... | -0.035175 | 0.146910 | 1.603500 | 1.170355 | 1.732852 | -1.009020 | 0.818321 | -0.086363 | -0.048242 | -1.025009 |
| noster | 1.572031 | -1.847746 | 0.483166 | 1.169696 | 0.515046 | 1.342866 | 4.558844 | 0.564918 | 1.128803 | -1.010498 | ... | -1.073775 | 0.294596 | 0.294962 | 0.136875 | 0.365881 | 0.431258 | -0.813561 | 1.461542 | 1.295876 | 0.666016 |
| in | -0.486388 | 1.539424 | -1.682679 | 1.438433 | 0.416782 | -0.036690 | -0.375583 | 0.840769 | 0.698870 | -0.866383 | ... | -3.265582 | -0.583095 | 0.264834 | -1.441967 | 1.034583 | -0.769255 | -1.308601 | -0.488309 | -0.014694 | 0.051321 |
| graecia | 0.400272 | 0.584794 | 0.923566 | 0.167011 | 1.140316 | 0.195778 | 1.496356 | -0.419286 | -1.794344 | 0.467281 | ... | -0.763984 | 0.557046 | -0.314156 | -0.671911 | 1.037812 | 0.144784 | 2.227822 | -0.035667 | 0.219159 | -0.333513 |
5 rows × 100 columns
Each collocate will be assigned to a cluster based on its vector. In this study, we use k-means clustering with an arbitrary number of 10 clusters.
cluster_n = 10
# Fix: seed the clustering so cluster assignments (and every downstream plot)
# are reproducible across notebook runs.
kmeans = KMeans(n_clusters=cluster_n, random_state=0).fit_predict(coll_vecs_df) # (1) initialize the model and (2) fit and return predictions
kmeans_dict = dict(zip(coll_vecs_df.index ,kmeans)) # for easy access we create a dictionary: {"term", "cluster"}
# one fixed color per cluster label "0".."9"
# (fix: the original range(0, cluster_n+1) mapped 11 colors for 10 clusters)
cluster_color_map = { str(i) : px.colors.qualitative.Safe[i] for i in range(cluster_n) } # for each term we fix a color
We can now annotate all collocates with cluster label.
# Annotate every (term, period, collocate) row with its stringified cluster
# label; collocates absent from the word2vec vocabulary (hence unclustered)
# get None. Fix: a single-column dictionary lookup does not need a row-wise
# apply(axis=1) — Series.map over the "colloc" column is equivalent and avoids
# per-row Python overhead.
all_collocs_time["kmeans_cluster"] = pd.Categorical(
    all_collocs_time["colloc"].map(lambda c: str(kmeans_dict[c]) if c in kmeans_dict else None)
)
# flatten the (term, row) MultiIndex produced by pd.concat into a "term" column
all_collocs_time = all_collocs_time.reset_index(level=1, drop=True)
all_collocs_time = all_collocs_time.reset_index()
all_collocs_time = all_collocs_time.rename(columns={"index":"term"})
all_collocs_time
| term | colloc | slice | rank | kmeans_cluster | |
|---|---|---|---|---|---|
| 0 | civitas | princeps | -450-0 | 1 | 1 |
| 1 | civitas | dono | -450-0 | 2 | 6 |
| 2 | civitas | noster | -450-0 | 3 | 6 |
| 3 | civitas | in | -450-0 | 4 | 9 |
| 4 | civitas | graecia | -450-0 | 5 | 1 |
| ... | ... | ... | ... | ... | ... |
| 2733 | urbs | iuxta | 450-900 | 46 | 9 |
| 2734 | urbs | episcopus | 450-900 | 47 | 4 |
| 2735 | urbs | tunc | 450-900 | 48 | 9 |
| 2736 | urbs | castrum | 450-900 | 49 | 7 |
| 2737 | urbs | beo | 450-900 | 50 | 5 |
2738 rows × 5 columns
# print collocs for selected term
all_collocs_time[all_collocs_time["term"] == 'civitas'].head()
# one colored table per period for 'civitas'
# NOTE(review): showLongTable is a helper defined elsewhere in the notebook;
# with show=False it appears to return a plotly table trace — confirm
fig1 = showLongTable(all_collocs_time[(all_collocs_time["term"] == 'civitas') &
                                      (all_collocs_time["slice"] == '-450-0')], show=False,
                     color="kmeans_cluster",
                     colormap=cluster_color_map
                     )
fig2 = showLongTable(all_collocs_time[(all_collocs_time["term"] == 'civitas') &
                                      (all_collocs_time["slice"] == '0-450')], show=False,
                     color="kmeans_cluster",
                     colormap=cluster_color_map
                     )
fig3 = showLongTable(all_collocs_time[(all_collocs_time["term"] == 'civitas') &
                                      (all_collocs_time["slice"] == '450-900')], show=False,
                     color="kmeans_cluster",
                     colormap=cluster_color_map
                     )
# three period tables side by side
fig = make_subplots(rows=1, cols=3, specs=[[{"type": "table"},{"type": "table"},{"type": "table"}]])
fig.add_trace(fig1, row=1, col=1)
fig.add_trace(fig2, row=1, col=2)
fig.add_trace(fig3, row=1, col=3)
fig.update_layout(height=800,
                  title_text="Collocates by period"
                  )
fig.show()
# Export the 'civitas' collocates. Fix: to_csv writes CSV bytes, but the file
# was named *.xlsx — name the file .csv so its content matches its extension
# (use DataFrame.to_excel if a real spreadsheet is wanted).
all_collocs_time[(all_collocs_time["term"] == 'civitas')].to_csv('out/civitas_collocs.csv')
# stacked histogram of cluster membership per period, all terms pooled
fig = px.histogram(all_collocs_time.sort_values("kmeans_cluster"), x="slice",
                   color="kmeans_cluster", color_discrete_map=cluster_color_map,
                   barmode="stack",
                   # Fix: category_orders expects an explicit list of category
                   # values; the string 'category ascending' is only valid for
                   # axis categoryorder and is silently wrong here. Use the
                   # same ordered label list as the other histogram cells.
                   category_orders={"kmeans_cluster":[str(i) for i in range(0,cluster_n+1) ]},
                   )
fig.update_xaxes(title="Period", categoryorder="category ascending" )
fig.update_yaxes(title="Count")
fig.update_layout(title="Diachronic distribution of collocational clusters (all terms)")
fig.show()
# the same stacked histogram, faceted by term
fig = px.histogram(all_collocs_time.sort_values("kmeans_cluster"),
                   x="slice", color=("kmeans_cluster"), color_discrete_map=cluster_color_map,
                   barmode="stack",
                   # Fix: category_orders expects an explicit list of category
                   # values, not the axis keyword 'category ascending' (see the
                   # correct usage in the grouped-histogram cell below).
                   category_orders={"kmeans_cluster":[str(i) for i in range(0,cluster_n+1) ]},
                   facet_col="term",
                   facet_col_wrap=3,
                   facet_row_spacing=0.06,
                   facet_col_spacing=0.06,)
fig.update_xaxes(title="Period", categoryorder="category ascending" )
fig.update_yaxes(title="Count")
fig.update_layout(height=1400, showlegend=True, title="Diachronic distribution of collocational clusters (by term)")
fig.show()
# grouped (side-by-side) histogram of cluster membership per period, by term
fig = px.histogram(all_collocs_time, x="slice", color="kmeans_cluster", barmode="group",
                   facet_col="term", color_discrete_map=cluster_color_map,
                   facet_col_wrap=2,
                   # fixed label order so clusters line up across facets
                   category_orders={"kmeans_cluster":[str(i) for i in range(0,cluster_n+1) ]},
                   facet_row_spacing=0.06,
                   facet_col_spacing=0.06,
                   )
fig.update_xaxes(title="Period", showticklabels=True)
fig.update_yaxes(title="Count")
fig.update_layout(height=2000, showlegend=True, title="Diachronic distribution of collocational clusters (by term)")
fig.show()
# per-(period, term) cluster membership counts, long format
# NOTE: this rebinds `df` (previously the overlap dataframe from simple_overlap)
df = pd.DataFrame(all_collocs_time.groupby(["slice", "term"])["kmeans_cluster"].value_counts().reset_index())
df.columns = ["slice", "term","kmeans_cluster", "count"]
#df[df["term"] == 'civitas'].head()
# stacked area chart: how each term's cluster composition evolves over periods
fig = px.area(df, x="slice", color="kmeans_cluster", color_discrete_map=cluster_color_map,
              y="count", category_orders={"kmeans_cluster":[str(i) for i in range(0,cluster_n+1) ]},
              facet_col="term", facet_col_wrap=4,
              facet_row_spacing=0.06, facet_col_spacing=0.04,
              )
fig.update_xaxes(title="", categoryorder="category ascending", showticklabels=True, tickangle=45)
fig.update_layout(height=1400, showlegend=True, title="Diachronic distribution of collocational clusters (by term)")
fig.show()
# animated histogram: one frame per period, cluster counts on the x-axis
fig = px.histogram(all_collocs_time,
                   x="kmeans_cluster", color="kmeans_cluster",
                   barmode="group",
                   facet_col="term",
                   color_discrete_map=cluster_color_map,
                   facet_col_wrap=5,
                   animation_frame="slice", animation_group="kmeans_cluster",
                   category_orders={"kmeans_cluster":[str(i) for i in range(0,cluster_n+1) ]},
                   facet_row_spacing=0.04, facet_col_spacing=0.04,
                   )
fig.update_layout(height=800, showlegend=True, title="Interactive diachronic distribution of collocational clusters (by term)")
fig.show()
Let's assume that the distribution of collocational clusters of a term at time_i corresponds to its semantics at this point in time. Diachronic variation of the counts may help us in assessing if the term was subject to important sense changes.
TODO It might be better, though, to evaluate the cross-cluster variation rather than one-cluster counts.
# variation of each cluster's membership count across periods, faceted by term
fig = px.box(df.sort_values("kmeans_cluster"), x="kmeans_cluster",
             color="kmeans_cluster", color_discrete_map=cluster_color_map,
             y="count",
             facet_col="term", facet_col_wrap=2,
             facet_row_spacing=0.02, facet_col_spacing=0.02,
             # Fix: category_orders expects an explicit list of category
             # values, not the axis keyword 'category ascending'; use the
             # same ordered label list as the histogram cells.
             category_orders={"kmeans_cluster":[str(i) for i in range(0,cluster_n+1) ]},
             )
fig.update_xaxes(title="", categoryorder="category ascending", showticklabels=True, tickangle=45)
fig.update_layout(height=2000, showlegend=True, title="Variation of collocational clusters counts (by cluster)")
fig.show()
# TODO: variation by genre
We're evaluating semantic (dis)similarity of collocates based on vectors retrieved from the word2vec model. To facilitate analyses, we are building similarity matrix for all collocates at once.
print("The model we'll be using: ", model, ".\n")
# Pairwise cosine-similarity matrix of all collocate vectors.
# Fix: np.matrix is deprecated in NumPy and slated for removal — build a plain
# 2-D ndarray instead (same shape, same indexing semantics for this use).
similarity_matrix = np.array([ model.wv.cosine_similarities(vec1, coll_vecs_df) for vec1 in coll_vecs_df.to_numpy() ])
print("Our similarity matrix has shape: ", similarity_matrix.shape, "\n", similarity_matrix[0:2])
#dist_matrix = distance.pdist(np.ndarray([ coll_vecs_df[0:2], coll_vecs_df[0:2]]).dropna(), metric='cosine')
The model we'll be using: Word2Vec<vocab=40504, vector_size=100, alpha=0.025> . Our similarity matrix has shape: (1230, 1230) [[1. 0.29489333 0.25937001 ... 0.33302709 0.1681883 0.3654971 ] [0.29489333 1. 0.13147971 ... 0.01036476 0.03372622 0.21049999]]
Also, for plotting purposes, we are reducing vectors to 2 dimensions with the t-SNE.
# one row per (term, period, collocate), annotated with its word2vec vector
tsne_df = all_collocs_time[["colloc", "slice", "term", "rank"]].reset_index(drop=True)
tsne_df = tsne_df.set_index("colloc", drop=False)
tsne_df["vec"] = [ coll_vecs_df.loc[colloc] if colloc in coll_vecs_df.index else None for colloc in tsne_df["colloc"] ]
tsne_df = tsne_df[ ~ tsne_df["vec"].isna()] # exclude collocates not in the word2vec dict
# fit TSNE
vecs = tsne_df["vec"]
vecs = np.array([ vec for vec in vecs ])
# reduce the 100-d vectors to 2-d for plotting; perplexity deliberately tied
# to the (small) number of periods — TODO confirm this choice is intentional
tsne = TSNE(n_components=2, random_state=0,
            init='pca', learning_rate="auto", metric="cosine", perplexity=len(periods))
coords = tsne.fit_transform( vecs )
# add (x,y) coordinates to each row
tsne_df["x"] = [ coord[0] for coord in coords ]
tsne_df["y"] = [ coord[1] for coord in coords ]
tsne_df["rank"] = pd.to_numeric(tsne_df["rank"])
tsne_df.head()
/home/krzys/miniconda3/envs/lvlt22/lib/python3.10/site-packages/sklearn/manifold/_t_sne.py:991: FutureWarning: The PCA initialization in TSNE will change to have the standard deviation of PC1 equal to 1e-4 in 1.2. This will ensure better convergence.
| colloc | slice | term | rank | vec | x | y | |
|---|---|---|---|---|---|---|---|
| colloc | |||||||
| princeps | princeps | -450-0 | civitas | 1 | 0 1.499599 1 1.654052 2 0.772930 3... | 96.779152 | 110.710159 |
| dono | dono | -450-0 | civitas | 2 | 0 -0.028665 1 0.206296 2 -0.636952 3... | -63.424732 | -34.199081 |
| noster | noster | -450-0 | civitas | 3 | 0 1.572031 1 -1.847746 2 0.483166 3... | -99.519875 | 102.313583 |
| in | in | -450-0 | civitas | 4 | 0 -0.486388 1 1.539424 2 -1.682679 3... | -116.270370 | -79.258522 |
| graecia | graecia | -450-0 | civitas | 5 | 0 0.400272 1 0.584794 2 0.923566 3... | 87.686058 | -13.885004 |
Diachronic collocations may be clustered based on their semantic similarity.
# all: only strongest collocates
topn = 10
# NOTE(review): this bare expression is a no-op unless it ends a notebook cell;
# the same filter is re-applied inside px.scatter below anyway
tsne_df[tsne_df["rank"] <= topn ]
fig = px.scatter(tsne_df[tsne_df["rank"] <= topn], x="x", y="y", color="slice", text="colloc", facet_col="term",
                 facet_col_wrap=3, facet_row_spacing=0.01)
fig.update_layout(height=2000)
fig.show()
The same data may be visualized dynamically.
# all: only strongest collocates
# animated version: one frame per period
topn = 20
fig = px.scatter(tsne_df[tsne_df["rank"] <= topn], x="x", y="y", color="slice", text="colloc", facet_col="term",
                 animation_frame="slice",
                 facet_col_wrap=3, facet_row_spacing=0.01)
fig.update_layout(height=2000, showlegend=False)
fig.show()
Usually, it is easier to analyze terms one by one.
# all: only strongest collocates
# single term, animated by period
topn = 25
term = 'civitas'
fig = px.scatter(tsne_df[ (tsne_df["rank"] <= topn) & (tsne_df["term"] == term) ],
                 x="x", y="y", color="slice", text="colloc",
                 animation_frame="slice",
                 )
# shrink markers so only the collocate labels stand out
fig.update_traces(marker_size=1, textfont=dict(size=12))
fig.update_layout(height=400, showlegend=False, template = "simple_white", )
fig.show()
# all: only strongest collocates
# single term, one facet row per period (static alternative to the animation)
topn = 25
term = 'civitas'
fig = px.scatter(tsne_df[ (tsne_df["rank"] <= topn) & (tsne_df["term"] == term) ],
                 x="x", y="y", color="slice", text="colloc",
                 facet_row="slice"
                 #animation_frame="slice",
                 )
fig.update_traces(marker_size=1, textfont=dict(size=10))
fig.update_layout(height=600, width = 800, showlegend=False, template = "simple_white", )
fig.show()
Let's evaluate semantic similarity of all collocates of all terms for each period.
TODO The plot is, however, hardly legible and difficult to interpret.
# all: only strongest collocates - by period - unreadable
topn = 10
# NOTE(review): bare expression, a no-op unless it ends a notebook cell
tsne_df[tsne_df["rank"] <= topn ]
fig = px.scatter(tsne_df[tsne_df["rank"] <= topn], x="x", y="y",
                 color="term", text="colloc", color_discrete_map=color_discrete_map_terms,
                 #facet_col="term",
                 facet_row="slice",
                 facet_col_wrap=2,
                 facet_row_spacing=0.01)
fig.update_layout(height=2000, title="Semantic relatedness of collocates through time")
fig.show()
Let's assume that semantic value of a collocational set at the point t_i is approximated by the mean of the vectors of its elements.
# TODO: Evaluate if and how the mean is correlated with the rank of collocates included in the set.
First, we calculate vector means for each term-period.
# cosine-similarity matrix of the collocate vectors within each (term, period) set
df_sims = tsne_df[~ tsne_df["vec"].isna() ].groupby(["term","slice"])["vec"].aggregate(lambda x: cosine_similarity([y for y in x]) ).reset_index()

# Mean/std of the pairwise similarities, over the strictly-lower triangle only.
# BUG FIX: np.mean(np.tril(m, k=-1)) averaged over ALL n*n cells — including
# the zeroed diagonal and upper triangle — which systematically biased the
# mean (and distorted the std) toward 0. Select the lower-triangle entries
# explicitly instead. NOTE: a group with a single collocate has no pairs, so
# its mean/std become NaN (the old code silently reported 0.0 there).
def _lower_triangle(mat):
    # flat array of the entries strictly below the diagonal
    mat = np.asarray(mat)
    return mat[np.tril_indices_from(mat, k=-1)]

df_sims["mean_sim"] = df_sims["vec"].map(lambda m: np.mean(_lower_triangle(m)))
df_sims["std_sim"] = df_sims["vec"].map(lambda m: np.std(_lower_triangle(m)))
We are now ready to plot the means for each term for each period. If the mean of collocation set vectors significantly changes, we may suppose the meaning of the term changed as well.
# mean intra-set similarity per period, one facet per term: a drop suggests
# the term's collocational set became less semantically coherent
fig = px.line(
    df_sims,
    x="slice",
    y="mean_sim",
    color="term", color_discrete_map=color_discrete_map_terms,
    facet_col="term", facet_col_wrap=3, facet_row_spacing=0.09, facet_col_spacing=0.06
) # collocational set coherence
fig.update_layout(height=1200, showlegend=False)
fig.update_yaxes(title="vectors mean")
fig.update_xaxes(title="period", showticklabels=True, tickangle=45)
fig.show()
If the vectors' mean is indicative of the word meaning, we may calculate the cosine similarity of the term-periods and plot them on the 2D plane by reducing the vectors with the t-SNE.
# mean word2vec vector of each (term, period) collocational set
df_sims_vecs = tsne_df[~ tsne_df["vec"].isna() ].copy().groupby(["term","slice"])["vec"].aggregate(vec_mean=lambda x: np.mean( [y for y in x], axis = 0 ) ).reset_index()
# compute 2-d t-SNE coordinates of the mean vectors
tsne = TSNE(n_components=2, random_state=0, init='pca', learning_rate="auto", metric="cosine", perplexity=len(periods)+1)
coords = tsne.fit_transform( np.array([ x for x in df_sims_vecs["vec_mean"] ]) )
df_sims_vecs["x"] = [coord[0] for coord in coords]
df_sims_vecs["y"] = [coord[1] for coord in coords]
# human-readable "term:period" label used by the scatter plots below
df_sims_vecs["label"] = df_sims_vecs.apply(lambda x: x["term"] + ":" + x["slice"], axis=1)
df_sims_vecs.head()
/home/krzys/miniconda3/envs/lvlt22/lib/python3.10/site-packages/sklearn/manifold/_t_sne.py:991: FutureWarning: The PCA initialization in TSNE will change to have the standard deviation of PC1 equal to 1e-4 in 1.2. This will ensure better convergence.
| term | slice | vec_mean | x | y | label | |
|---|---|---|---|---|---|---|
| 0 | civitas | -450-0 | [0.7331563708465546, 0.5749529541656375, 0.665... | 5.269683 | -15.128561 | civitas:-450-0 |
| 1 | civitas | 0-450 | [0.15268317833542824, 0.481530849263072, 0.062... | -45.265285 | 2.392575 | civitas:0-450 |
| 2 | civitas | 450-900 | [0.13842779921367765, 0.6136744904518128, 0.46... | -42.438435 | 3.893654 | civitas:450-900 |
| 3 | consilium | -450-0 | [0.6008564168587327, 0.45102046817541125, 0.59... | 12.957879 | -11.167262 | consilium:-450-0 |
| 4 | consilium | 0-450 | [0.7520405086023467, 0.19572401635957007, 0.62... | 15.846694 | -11.784343 | consilium:0-450 |
# scatter of the t-SNE-reduced term-period mean vectors, labelled "term:period"
fig = px.scatter(df_sims_vecs, x="x", y="y",
                 color="term", text="label", color_discrete_map=color_discrete_map_terms,
                 )
# color each label like its marker, then render labels only (no markers)
fig.for_each_trace(lambda t: t.update(textfont_color=t.marker.color, textposition='top center'))
fig.update_traces(mode="text")
fig.update_layout(height=800, showlegend=False, title="Distance of term-periods vector means (t-SNE)")
fig.show()
# publication-quality variant of the same scatter (white template, fixed size)
if prettyprint:
    fig = px.scatter(df_sims_vecs, x="x", y="y",
                     color="term", text="label", color_discrete_map=color_discrete_map_terms,
                     )
    fig.for_each_trace(lambda t: t.update(textfont_color=t.marker.color, textposition='top center'))
    fig.update_traces(mode="text")
    fig.update_layout(height=500, width=800,
                      template="simple_white",
                      showlegend=False, title="Distance of term-periods vector means (t-SNE)")
    fig.show()
The cosine similarity of term-periods may also be investigated on a heatmap.
# Pairwise cosine similarity between all term-period mean vectors,
# rendered as an interactive heatmap.
df_sims_vecs_cosine = cosine_similarity(list(df_sims_vecs["vec_mean"]))
labels = list(df_sims_vecs["label"])
fig = go.Figure(
    go.Heatmap(x=labels, y=labels, z=df_sims_vecs_cosine)
)
fig.update_layout(
    height=1000,
    title="Semantic relatedness of term-periods (vector means)",
)
# Keep both axes in alphabetical order (y reversed so the diagonal runs
# top-left to bottom-right).
fig.update_yaxes(categoryorder='category descending')
fig.update_xaxes(categoryorder='category ascending')
fig.show()
# heatmap with dendrogram following the example at https://plotly.com/python/dendrogram/#plot-a-dendrogram-with-a-heatmap
# NOTE(review): the heatmap cells show pdist distances between ROWS of the
# cosine-similarity matrix (not the similarities themselves) — this matches
# the linked plotly example.
data_array = df_sims_vecs_cosine
# The matrix is square and symmetric, so the transpose mainly mirrors the
# structure of the plotly example.
data_array = data_array.transpose()
labels = [ row for row in df_sims_vecs["label"] ]
# Initialize figure by creating upper dendrogram
fig = ff.create_dendrogram(data_array, orientation='bottom', labels=labels)
# Route the top dendrogram onto the secondary y-axis so the heatmap can
# occupy the primary axes below it.
for i in range(len(fig['data'])):
    fig['data'][i]['yaxis'] = 'y2'
# Create Side Dendrogram
dendro_side = ff.create_dendrogram(data_array, orientation='right')
# Route the side dendrogram onto the secondary x-axis (left strip).
for i in range(len(dendro_side['data'])):
    dendro_side['data'][i]['xaxis'] = 'x2'
# Add Side Dendrogram Data to Figure
for data in dendro_side['data']:
    fig.add_trace(data)
# Create Heatmap
# The side dendrogram's tick labels are the row indices in leaf order; use
# them to reorder the distance matrix so rows/columns line up with the tree.
dendro_leaves = dendro_side['layout']['yaxis']['ticktext']
dendro_leaves = list(map(int, dendro_leaves))
# Condensed pairwise distances (scipy pdist default metric: Euclidean)
# between rows of the similarity matrix, expanded back to square form.
data_dist = distance.pdist(data_array)
heat_data = distance.squareform(data_dist)
heat_data = heat_data[dendro_leaves,:]
heat_data = heat_data[:,dendro_leaves]
heatmap = [
    go.Heatmap(
        x = dendro_leaves,
        y = dendro_leaves,
        z = heat_data,
        colorscale = 'Blues'
    )
]
# Snap heatmap cell centres onto the dendrogram leaf tick positions.
heatmap[0]['x'] = fig['layout']['xaxis']['tickvals']
heatmap[0]['y'] = dendro_side['layout']['yaxis']['tickvals']
# Add Heatmap Data to Figure
for data in heatmap:
    fig.add_trace(data)
# Edit Layout
fig.update_layout({'width':800, 'height':800,
                   'showlegend':False, 'hovermode': 'closest',
                   })
# Edit xaxis — main heatmap area takes the right 85% of the canvas.
fig.update_layout(xaxis={'domain': [.15, 1],
                         'mirror': False,
                         'showgrid': False,
                         'showline': False,
                         'zeroline': False,
                         'ticks':""})
# Edit xaxis2 — side dendrogram occupies the left 15%.
fig.update_layout(xaxis2={'domain': [0, .15],
                          'mirror': False,
                          'showgrid': False,
                          'showline': False,
                          'zeroline': False,
                          'showticklabels': False,
                          'ticks':""})
# Edit yaxis — heatmap takes the lower 85% of the canvas.
fig.update_layout(yaxis={'domain': [0, .85],
                         'mirror': False,
                         'showgrid': False,
                         'showline': False,
                         'zeroline': False,
                         'showticklabels': False,
                         'ticks': ""
                         })
# Edit yaxis2 — top dendrogram sits in the strip above the heatmap.
fig.update_layout(yaxis2={'domain':[.825, .975],
                          'mirror': False,
                          'showgrid': False,
                          'showline': False,
                          'zeroline': False,
                          'showticklabels': False,
                          'ticks':""})
# Plot!
fig.update_layout(height=1000, width=1000)
fig.show()
# qualitative flow of collocates
# Per-term DataFrame with columns (colloc, slice, rank); previewed here for "civitas".
collocs_time_by_term_dfs["civitas"]
| colloc | slice | rank | |
|---|---|---|---|
| 0 | princeps | -450-0 | 1 |
| 1 | dono | -450-0 | 2 |
| 2 | noster | -450-0 | 3 |
| 3 | in | -450-0 | 4 |
| 4 | graecia | -450-0 | 5 |
| ... | ... | ... | ... |
| 145 | ipse | 450-900 | 46 |
| 146 | ingredior | 450-900 | 47 |
| 147 | dux | 450-900 | 48 |
| 148 | igitur | 450-900 | 49 |
| 149 | vel | 450-900 | 50 |
150 rows × 3 columns
Let's represent collocations of a term or a set of terms as a graph. The node set comprises all the collocates of the term or terms as well as the terms themselves. The edges link collocate nodes with the respective term nodes. The edges are assigned a weight attribute which defines node attraction and whose value is inversely proportional to the collocation rank: $1-(rank/max(rank))$.
TODO More rigorous definition.
# Combine the per-term collocation tables into one long DataFrame and derive
# a graph edge weight inversely proportional to rank: weight = 1 - rank/max(rank).
# Assigning the term as a scalar (rather than a Series built from
# itertools.repeat) cannot misalign on a non-default index, and a single
# concat avoids quadratic per-iteration copying.
frames = [dataf.assign(term=term) for term, dataf in collocs_time_by_term_dfs.items()]
net_df = pd.concat(frames) if frames else pd.DataFrame()
# e.g. "civitas" + "_" + "-450-0" -> "civitas_-450-0"
net_df['slice_term'] = net_df['term'] + '_' + net_df['slice'].str.replace('period', '')
ranks = pd.to_numeric(net_df["rank"])
net_df = net_df.assign(weight=1 - ranks / ranks.max())
net_df.head()
| colloc | slice | rank | term | slice_term | weight | |
|---|---|---|---|---|---|---|
| 0 | princeps | -450-0 | 1 | civitas | civitas_-450-0 | 0.98 |
| 1 | dono | -450-0 | 2 | civitas | civitas_-450-0 | 0.96 |
| 2 | noster | -450-0 | 3 | civitas | civitas_-450-0 | 0.94 |
| 3 | in | -450-0 | 4 | civitas | civitas_-450-0 | 0.92 |
| 4 | graecia | -450-0 | 5 | civitas | civitas_-450-0 | 0.90 |
Since collocate nodes may be linked to multiple terms, intuitively, the network may be used to assess the degree of semantic relatedness between the terms. To increase legibility we will plot only the 20 strongest collocates.
# Build a term–collocate multigraph from the strongest collocations only.
rank_thresh = 20
strong_edges = net_df[pd.to_numeric(net_df["rank"]) <= rank_thresh]
g = nx.from_pandas_edgelist(
    strong_edges,
    source="colloc", target="term",
    edge_key="slice",
    edge_attr=["weight", "slice"],
    create_using=nx.MultiGraph(),
)
# Drop collocates attached to a single node only — they carry no
# cross-term information.
g.remove_nodes_from([n for n, d in g.degree() if d < 2])
# Focus terms keep their fixed palette colour and a visible node disc;
# plain collocates are grey with size 0 (label only).
node_colors = [color_discrete_map_terms.get(node, '#808080') for node in g.nodes]
node_sizes = [300 if node in color_discrete_map_terms else 0 for node in g.nodes]
pos = nx.spring_layout(g, seed=1)
if prettyprint:
    plt.figure(1, figsize=(14, 8), dpi=800)
else:
    plt.figure(1, figsize=(20, 20))
nx.draw_networkx(g, pos, width=0.1, node_color=node_colors,
                 node_size=node_sizes, font_size=8)
The networks may be inspected period by period.
# Build one collocation graph per time slice so semantic neighbourhoods can
# be compared period by period.
graphs = []
rank_thresh = 20
for period in periods:  # original sliced periods[0:len(periods)], i.e. all of them
    mask = (net_df["slice"] == period) & (pd.to_numeric(net_df["rank"]) <= rank_thresh)
    g = nx.from_pandas_edgelist(net_df[mask], source="colloc", target="term",
                                edge_key="slice",
                                edge_attr=["weight", "slice"],
                                create_using=nx.MultiGraph())
    # remove low-degree nodes (collocates linked to only one term)
    low_degree = [n for n, d in g.degree() if d < 2]
    g.remove_nodes_from(low_degree)
    # Focus terms: palette colour + visible disc; plain collocates: grey, size 0.
    node_colors = [ color_discrete_map_terms[node] if node in color_discrete_map_terms.keys() else '#808080' for node in g.nodes ]
    node_sizes = [ 300 if node in color_discrete_map_terms.keys() else 0 for node in g.nodes ]
    # (a per-node font_sizes list was computed here before but never used —
    # drawing below hard-codes font_size=12)
    pos = nx.spring_layout(g, seed=675, k=0.99)
    graphs.append((g, pos, node_colors, node_sizes, period))
# Lay the per-period graphs out on a grid, two columns wide.
ncols = 2
nrows = -(-len(graphs) // ncols)  # ceiling division
print(ncols, nrows)
if prettyprint:
    fig, axs = plt.subplots(nrows, ncols, sharex=True, figsize=(30, 20), dpi=300)
else:
    fig, axs = plt.subplots(nrows, ncols, sharex=True, figsize=(20, 30))
axes = axs.flatten()  # hoisted: was recomputed on every loop iteration
for ax, (graph, pos, colors, sizes, period) in zip(axes, graphs):
    ax.set_title(period)
    nx.draw_networkx(graph, pos, width=0.1, node_color=colors,
                     node_size=sizes,
                     font_size=12,
                     ax=ax)
2 2
Likewise, the (dis)similarity may be evaluated for term-periods. This time, we are using the nx.algorithms.community.greedy_modularity_communities algorithm to detect "communities" (i.e. sense clusters) in the network.
TODO Improve legibility or remove.
# Detect sense clusters among term-period nodes: build a directed multigraph
# of the 10 strongest collocations per term-period, then colour nodes by
# greedy-modularity community membership.
rank_thresh = 10
g = nx.from_pandas_edgelist(net_df[ pd.to_numeric(net_df["rank"]) <= rank_thresh ],
                            source="colloc", target="slice_term",
                            edge_key="slice",
                            edge_attr=["weight", "slice"], create_using=nx.MultiDiGraph())
# remove low-degree nodes (linked to only one other node)
low_degree = [n for n, d in g.degree() if d < 2]
g.remove_nodes_from(low_degree)
G = g.copy()
communities = nx.algorithms.community.greedy_modularity_communities(G)
# One colour per community from the Alphabet palette.
# NOTE(review): the palette is finite — more communities than palette entries
# would raise IndexError; confirm community count stays small.
communities_cols = [ px.colors.qualitative.Alphabet[i] for i, v in enumerate(communities) ]
# Map each node to the colour of the first community it appears in.
communities_dict = {}
for i, comm in enumerate(communities):
    for com in comm:
        communities_dict.setdefault(com, communities_cols[i])
pos = nx.spring_layout(G,k=0.01)
plt.figure(1,figsize=(14,14))
# Draw each node's label in its community colour; labels are drawn one at a
# time because draw_networkx_labels takes a single font_color per call.
for node in G.nodes:
    lbl = node
    #lbl = node.split('_')[0]
    col = communities_dict[lbl]
    nx.draw_networkx_labels(G, pos=pos, labels={node:node}, font_color=col)
# Nodes themselves are invisible (node_size=0); only labels carry colour.
# NOTE(review): node_color relies on communities_dict insertion order
# matching G.nodes order — harmless here since node_size=0, but verify
# before making nodes visible.
nx.draw_networkx_nodes(G, pos,
                       node_size=0,
                       label=[ n for n in G.nodes ],
                       node_color = communities_dict.values())
<matplotlib.collections.PathCollection at 0x7f8d41c7c5e0>
# interactive, but slow!
def plot_pyvis_graph(graph=None):
    """Render a collocation graph as an interactive pyvis HTML page.

    Parameters
    ----------
    graph : networkx graph, optional
        Graph to render. Defaults to the notebook-level graph ``g`` built
        above, preserving the original no-argument behaviour.

    Side effects: writes 'collocs.html' and displays it in the notebook.
    """
    if graph is None:
        graph = g  # fall back to the module-level graph
    net_vis = network.Network(notebook=True, width="100%")
    net_vis.from_nx(graph)
    net_vis.show_buttons()
    # Recolour the focus-term nodes with their fixed palette colours.
    for node in net_vis.nodes:
        if node["label"] in color_discrete_map_terms:
            node["color"] = color_discrete_map_terms[node["label"]]
    net_vis.show('collocs.html')

# Uncomment to render (slow for large graphs):
# plot_pyvis_graph()